**************************************************
*  PREPARE L2 DATA
**************************************************

clear
use "IL-names-addresses-2011-2020.dta", clear

sort suid year

* Split birthdate and derive year of birth
split birthdate, parse("/")
destring birthdate3, gen(yob)

* Fill missing yob with modal year by individual
bys suid: egen mode_yob = mode(yob)
replace yob = mode_yob if missing(yob)
drop mode_yob

* Keep only one record per person 
keep suid firstname middle_init lastname sex yob
duplicates drop
bys suid: gen record_count = _N

gen has_middle = !missing(middle_init)
bys suid: egen any_middle = max(has_middle)
drop if record_count > 1 & any_middle == 1 & has_middle == 0

bys suid (lastname firstname): keep if _n == 1

* Clean up
drop record_count has_middle any_middle
compress

* Save cleaned L2 file
save "IL-names-addresses-2011-2020-unique-suids.dta", replace


**************************************************
*  MATCH OFFICER DATA TO L2 NAMES
**************************************************

clear
use "assignment_names.dta", clear

* Ensure year is numeric
gen assgn_year = real(substr(date,1,4))
replace year = assgn_year if missing(year)
drop assgn_year

* Keep only identifiers and de-duplicate
keep uid last_name first_name middle_initial yob gender year
duplicates drop
* Rename for merge
rename (last_name first_name middle_initial gender) (lastname firstname middle_init sex)

* Drop missing uids
drop if missing(uid)

* Perform fuzzy matching with reclink
reclink ///
    lastname firstname middle_init sex yob ///
    using "IL-names-addresses-2011-2020-unique-suids.dta", ///
    idmaster(uid) idusing(suid) gen(score_match) required(sex yob)

* Keep strong matches
keep if score_match >= 0.90
*---- Inspect lower macthing scores and manually adjudicate

* For duplicates, keep only highest match
bys uid: egen max_score = max(score_match)
keep if score_match == max_score
drop max_score

* Merge back to get full L2 address data
merge 1:m suid using "IL-names-addresses-2011-2020.dta"
keep if _merge == 3
drop _merge

* Harmonize and fill birthdate
replace birthdate = "" if birthdate == "NA"
bys suid: egen mode_bd = mode(birthdate)
replace birthdate = mode_bd if missing(birthdate)
drop mode_bd

* Standardize city
replace votecity = "CHICAGO" if upper(votecity) == "CHICAGO"

* Rename L2 vars for clarity
rename (suid lastname firstname middle_init yob birthdate) (suid_L2 lastname_L2 firstname_L2 middle_init_L2 yob_L2 birthdate_L2)

* Carry forward/backward to fill gaps
xtset uid year
tsfill, full
sort uid year
foreach var in suid_L2 sex yob_L2 lastname_L2 firstname_L2 middle_init_L2 birthdate_L2 score_match ///
               votehousenumber votepredirection votestreetname votestreettype votezip votecity votestate {
    by uid: carryforward `var', replace
}

* Carry backward by reversing time
gen minus_year = -year
xtset uid minus_year
sort uid minus_year
foreach var in suid_L2 sex yob_L2 lastname_L2 firstname_L2 middle_init_L2 birthdate_L2 score_match ///
               votehousenumber votepredirection votestreetname votestreettype votezip votecity votestate {
    by uid: carryforward `var', replace
}
drop minus_year

**************************************************
*  GEOCODE ADDRESSES
**************************************************

* Build full address string
tostring votehousenumber, replace
gen fulladdress = ///
    votehousenumber + " " + votepredirection + " " + ///
    votestreetname + " " + votestreettype + ", " + ///
    votecity + ", " + votestate + " " + votezip

* Assign unique address IDs
egen address_id = group(fulladdress)

* Export unique addresses for geocoding
preserve
    keep address_id fulladdress
    duplicates drop
    export delimited "unique-addresses-to-geocode.csv", replace
restore

* (Outside Stata: geocode in Geocodio, save as CSV)

* Import geocoded results
preserve
    import delimited "unique-addresses-geocoded.csv", clear
    keep fulladdress address_id latitude longitude fullfipsblock
    rename (latitude longitude fullfipsblock) (lat lon block)
    save "unique-addresses-geocoded.dta", replace
restore

* Merge geocodes back
merge m:1 address_id using "unique-addresses-geocoded.dta"
drop _merge

**************************************************
*  FINAL MERGES AND CLEANUP
**************************************************

* Merge back to assignments
merge 1:m uid year using "assignment_names.dta"
gen has_address = (_merge == 3)
drop if _merge == 1
drop _merge

* Convert assignment date to Stata date
gen assgn_date = date(date, "YMD")
format assgn_date %td
gen assgn_year = year(assgn_date)

* Merge policing district & community area via block
merge m:1 block using "sj-2010blocks-beats-CAs.dta"
drop if _merge == 2
drop _merge

* Prefix address fields
foreach v in block lat lon pol_district pol_beat ca_num ca_name {
    rename `v' add_`v'
}

compress

* Save final dataset
save "assignment-with-L2-addresses.dta", replace
